import numpy as np
import pandas as pd
import xgboost as xgb
import plotly
import sklearn
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
# Use plotly as the backend for pandas' built-in .plot() calls.
pd.options.plotting.backend = "plotly"
# Load the league standings table; the path is relative to the working directory.
epldf = pd.read_csv('EPL Standings 2000-2022.csv')
# Display the last 20 rows of the table.
epldf.tail(20)
| Season | Pos | Team | Pld | W | D | L | GF | GA | GD | Pts | Qualification or relegation | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 420 | 2021-22 | 1 | Manchester City | 38 | 29 | 6 | 3 | 99 | 26 | 73 | 93 | Qualification for the Champions League group s... |
| 421 | 2021-22 | 2 | Liverpool | 38 | 28 | 8 | 2 | 94 | 26 | 68 | 92 | Qualification for the Champions League group s... |
| 422 | 2021-22 | 3 | Chelsea | 38 | 21 | 11 | 6 | 76 | 33 | 43 | 74 | Qualification for the Champions League group s... |
| 423 | 2021-22 | 4 | Tottenham Hotspur | 38 | 22 | 5 | 11 | 69 | 40 | 29 | 71 | Qualification for the Champions League group s... |
| 424 | 2021-22 | 5 | Arsenal | 38 | 22 | 3 | 13 | 61 | 48 | 13 | 69 | Qualification for the Europa League group stag... |
| 425 | 2021-22 | 6 | Manchester United | 38 | 16 | 10 | 12 | 57 | 57 | 0 | 58 | Qualification for the Europa League group stag... |
| 426 | 2021-22 | 7 | West Ham United | 38 | 16 | 8 | 14 | 60 | 51 | 9 | 56 | Qualification for the Europa Conference League... |
| 427 | 2021-22 | 8 | Leicester City | 38 | 14 | 10 | 14 | 62 | 59 | 3 | 52 | Not Applicable |
| 428 | 2021-22 | 9 | Brighton & Hove Albion | 38 | 12 | 15 | 11 | 42 | 44 | -2 | 51 | Not Applicable |
| 429 | 2021-22 | 10 | Wolverhampton Wanderers | 38 | 15 | 6 | 17 | 38 | 43 | -5 | 51 | Not Applicable |
| 430 | 2021-22 | 11 | Newcastle United | 38 | 13 | 10 | 15 | 44 | 62 | -18 | 49 | Not Applicable |
| 431 | 2021-22 | 12 | Crystal Palace | 38 | 11 | 15 | 12 | 50 | 46 | 4 | 48 | Not Applicable |
| 432 | 2021-22 | 13 | Brentford | 38 | 13 | 7 | 18 | 48 | 56 | -8 | 46 | Not Applicable |
| 433 | 2021-22 | 14 | Aston Villa | 38 | 13 | 6 | 19 | 52 | 54 | -2 | 45 | Not Applicable |
| 434 | 2021-22 | 15 | Southampton | 38 | 9 | 13 | 16 | 43 | 67 | -24 | 40 | Not Applicable |
| 435 | 2021-22 | 16 | Everton | 38 | 11 | 6 | 21 | 43 | 66 | -23 | 39 | Not Applicable |
| 436 | 2021-22 | 17 | Leeds United | 38 | 9 | 11 | 18 | 42 | 79 | -37 | 38 | Not Applicable |
| 437 | 2021-22 | 18 | Burnley | 38 | 7 | 14 | 17 | 34 | 53 | -19 | 35 | Relegation to the EFL Championship |
| 438 | 2021-22 | 19 | Watford | 38 | 6 | 5 | 27 | 34 | 77 | -43 | 23 | Relegation to the EFL Championship |
| 439 | 2021-22 | 20 | Norwich City | 38 | 5 | 7 | 26 | 23 | 84 | -61 | 22 | Relegation to the EFL Championship |
# (number of rows, number of columns) of the standings table.
epldf.shape
(440, 12)
# Restrict the table to its numeric standings columns, then draw a heatmap
# of their pairwise correlations.
numeric_columns = ['Pos', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts']
num_epldf = epldf[numeric_columns]
correlations = num_epldf.corr()
sns.heatmap(correlations, cmap="YlGnBu")
<AxesSubplot:>
from sklearn.model_selection import GridSearchCV

# Features: win/draw/loss counts and goal statistics; target: points.
X = epldf[['W', 'D', 'L', 'GF', 'GA', 'GD']]
Y = epldf['Pts']

# Hold out the final 20 rows as the test set and train on everything before
# them. Both boundaries are derived from the same constant so the two sets can
# never overlap or leave a gap if the CSV gains or loses rows (the previous
# hard-coded `:420` only matched `-20:` for exactly 440 rows).
TEST_ROWS = 20
X_train = X.iloc[:-TEST_ROWS]
y_train = Y.iloc[:-TEST_ROWS]
X_test = X.iloc[-TEST_ROWS:]
y_test = Y.iloc[-TEST_ROWS:]

model = xgb.XGBRegressor()

# set up search grid
param_grid = {
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.01, 0.015, 0.02],
}
# try out every combination of the above values (5-fold cross-validation)
search = GridSearchCV(model, param_grid, cv=5).fit(X_train, y_train)
print("The best parameters are: ", search.best_params_)
The best parameters are: {'learning_rate': 0.02, 'max_depth': 3}
# Build a fresh regressor from the settings the grid search selected, fit it
# on the full training split, and predict points for the held-out rows.
best_params = search.best_params_
model = xgb.XGBRegressor(
    max_depth=best_params["max_depth"],
    learning_rate=best_params["learning_rate"],
)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
predictions
array([77.04239 , 76.1504 , 63.72129 , 62.976234, 61.335968, 50.8153 ,
50.074333, 45.032627, 43.991783, 43.726967, 41.496918, 41.486866,
40.219643, 38.69451 , 34.270206, 34.454823, 32.411 , 31.32315 ,
19.110287, 20.745018], dtype=float32)
import plotly.graph_objects as go

# plotly plots not showing on pdf/html output
# NOTE(review): the renderer setting lives on plotly.io rather than on
# graph_objects, e.g.
# plotly.io.renderers.default = "plotly_mimetype+notebook"

# The predictions were made for X_test (the last 20 rows of the table), so the
# "actual" values and positions must come from those same rows. Taking
# `epldf.iloc[:20]` instead compares against the first 20 rows, which belong
# to a different season (the 2021-22 rows sit at index 420-439).
position = epldf.loc[y_test.index, 'Pos']
points2122 = y_test

# Grouped bars per league position: model prediction, true points, and the
# signed residual (actual minus predicted).
fig = go.Figure()
fig.add_trace(go.Bar(
    x=position,
    y=predictions,
    name='Predicted',
    marker_color='rgb(55, 83, 109)',
))
fig.add_trace(go.Bar(
    x=position,
    y=points2122,
    name='Actual',
    marker_color='rgb(26, 118, 255)',
))
fig.add_trace(go.Bar(
    x=position,
    y=points2122 - predictions,
    name='Residual',
    marker_color='Red',
))
fig.update_layout(
    title='Predicted vs Actual Points (2021-2022 season)',
    xaxis=dict(
        title='Position',
        tickfont_size=14,
    ),
    yaxis=dict(
        # `titlefont_size` is a deprecated axis attribute; the title font is
        # set through the nested title object instead.
        title=dict(text='Points', font=dict(size=16)),
        tickfont_size=14,
    ),
    legend=dict(
        x=1.00,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
    barmode='group',
    bargap=0.15,  # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
)
fig.show()
# Score the first model against the true points of the rows it predicted
# (y_test, the held-out last 20 rows) so the errors describe the same season
# as the predictions.
mse2122 = mean_squared_error(y_test, predictions)
rmse2122 = np.sqrt(mse2122)
avg_abs_diff = mean_absolute_error(y_test, predictions)
print("The MSE and RMSE are %f and %f.\n Average of aboslute values of the difference: %f" %(mse2122,rmse2122,avg_abs_diff))
The MSE and RMSE are 58.472033 and 7.646701. Average of aboslute values of the difference: 7.111054
def metrics(pred, act):
    """Print the MSE, RMSE and mean absolute error between two series.

    Args:
        pred: Predicted values.
        act: Actual (reference) values, same length as ``pred``.

    Returns:
        None. The three error figures are written to stdout.
    """
    squared_error = mean_squared_error(pred, act)
    root_error = np.sqrt(squared_error)
    absolute_error = mean_absolute_error(act, pred)
    report = "The MSE and RMSE are %f and %f.\n Average of aboslute values of the difference: %f" % (
        squared_error,
        root_error,
        absolute_error,
    )
    print(report)
# Second, wider grid: one extra tree depth and a broader learning-rate range.
param_grid = {
    "max_depth": [3, 4, 5, 6, 7],
    "learning_rate": [0.005, 0.01, 0.015, 0.02, 0.03, 0.1],
}
# Evaluate every depth / learning-rate combination with 5-fold
# cross-validation on the training split.
search = GridSearchCV(model, param_grid, cv=5)
search.fit(X_train, y_train)
print("The best parameters are: ", search.best_params_)
The best parameters are: {'learning_rate': 0.1, 'max_depth': 4}
# Refit with the settings chosen by the second grid search and predict the
# held-out rows again.
model = xgb.XGBRegressor(
    learning_rate=search.best_params_["learning_rate"],
    max_depth=search.best_params_["max_depth"],
)
model.fit(X_train, y_train)
predictions_2 = model.predict(X_test)
# Score against y_test, the true points of the rows that were predicted, so
# the reported errors refer to the same season as the predictions.
metrics(predictions_2, y_test)
The MSE and RMSE are 49.948304 and 7.067411. Average of aboslute values of the difference: 4.596758
# Actual points and league positions are taken from the held-out rows that
# predictions_2 was computed for (the index of y_test), so every bar group
# compares a prediction with the true value of the same table row.
actual_2122 = y_test
position_2122 = epldf.loc[y_test.index, 'Pos']

# Grouped bars per league position: second model's prediction, true points,
# and the signed residual (actual minus predicted).
fig = go.Figure()
fig.add_trace(go.Bar(
    x=position_2122,
    y=predictions_2,
    name='Predicted',
    marker_color='rgb(55, 83, 109)',
))
fig.add_trace(go.Bar(
    x=position_2122,
    y=actual_2122,
    name='Actual',
    marker_color='rgb(26, 118, 255)',
))
fig.add_trace(go.Bar(
    x=position_2122,
    y=actual_2122 - predictions_2,
    name='Residual',
    marker_color='Red',
))
fig.update_layout(
    title='Predicted vs Actual Points (2021-2022 season)',
    xaxis=dict(
        title='Position',
        tickfont_size=14,
    ),
    yaxis=dict(
        # `titlefont_size` is a deprecated axis attribute; the title font is
        # set through the nested title object instead.
        title=dict(text='Points', font=dict(size=16)),
        tickfont_size=14,
    ),
    legend=dict(
        x=1.00,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)',
    ),
    barmode='group',
    bargap=0.15,  # gap between bars of adjacent location coordinates.
    bargroupgap=0.1,  # gap between bars of the same location coordinate.
)
fig.show()
# Match-level club soccer data, downloaded over the network each time this runs.
URL = 'https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv'
data = pd.read_csv(URL)
# example data point
# NOTE(review): position 60000 is hard-coded; this raises IndexError if the
# downloaded file has 60000 rows or fewer.
data.iloc[60000]
season 2022 date 2022-10-19 league_id 2411 league Barclays Premier League team1 Manchester United team2 Tottenham Hotspur spi1 77.12 spi2 81.52 prob1 0.3802 prob2 0.384 probtie 0.2358 proj_score1 1.56 proj_score2 1.57 importance1 65.2 importance2 70.0 score1 2.0 score2 0.0 xg1 1.72 xg2 0.9 nsxg1 2.2 nsxg2 1.18 adj_score1 2.1 adj_score2 0.0 Name: 60000, dtype: object
# plotly.offline.init_notebook_mode()